#######################################################################
#####                                                            ######
#####   Input: Debates data; dictionary words                    ######
#####   Output: Measure of debate topic                          ######
#####                                                            ######
#######################################################################

# Load libraries

library(quanteda) # v3.3.1
library(quanteda.dictionaries) # [github::kbenoit/quanteda.dictionaries] v0.4
library(data.table) # v1.14.8
library(text2vec) # v0.6.3

# Build the dictionary used to detect talk "about women" (and, for
# comparison, "about men") from the Female and Male entries of the `liwc`
# object stored in data/dictionaries/liwc.Rdata.

load("data/dictionaries/liwc.Rdata")

woman_words <- liwc$Female
man_words <- liwc$Male

# Two keys, `woman` and `man`; each key later yields one count column.
dictionary_to_use_women <- dictionary(
  list(
    woman = woman_words,
    man = man_words
  )
)
## Load the debates data (the .Rdata file provides the `debates` table)

load("data/debates.Rdata")

# Drop pre-Blair years: keep every row whose parliamentary term is not
# "1992-1997" (data.table row filter; rows with a missing term are dropped)

debates <- debates[parliamentary_term != "1992-1997"]

## Loop over year-months
##
## For every distinct value of `debates$yearmon`, score the texts in the
## `parent` column against `dictionary_to_use_women` and store the result.
## Output: `year_out_list`, a list with one data.table per year-month that
## holds the document identifiers, the scored text (`adjusted_sent`) and one
## `dic_<key>` count column per dictionary key.
## Processing is batched by year-month, presumably to limit memory use
## (TODO confirm).

year_months <- unique(debates$yearmon)

# Preallocate one slot per year-month instead of growing the list
year_out_list <- vector("list", length(year_months))

for (i in seq_along(year_months)) {

  # Indexing (rather than iterating over the values directly) keeps the
  # class of `yearmon`, so the progress output stays human-readable
  y <- year_months[i]
  print(y)

  debates_test <- debates[yearmon == y]

  # Turn to corpus; the text to score lives in the `parent` column
  debates_test_corpus <- corpus(debates_test, text_field = "parent")

  ## Tokenise and build the document-feature matrix
  # Calling dfm() directly on a corpus is deprecated in quanteda 3.x and
  # removed in 4.x, so tokenise explicitly first (same default settings)
  debates_dfm <- dfm(tokens(debates_test_corpus))

  ## Calculate dictionary counts
  # dfm_lookup() replaces the deprecated `dictionary` argument of dfm()
  dfm_liwc <- dfm_lookup(debates_dfm, dictionary = dictionary_to_use_women)

  dict_scores <- as.data.frame(as.matrix(dfm_liwc))
  names(dict_scores) <- paste0("dic_", names(dict_scores))

  ## Merge together
  # Document variables are carried over from the corpus to the dfm
  doc_meta <- docvars(debates_dfm)

  # as.character() on a corpus replaces the deprecated texts()
  year_out <- data.table(epobject_id = doc_meta$epobject_id,
                         section_id = doc_meta$section_id,
                         person_id = doc_meta$person_id,
                         adjusted_sent = as.character(debates_test_corpus),
                         dict_scores)

  year_out_list[[i]] <- year_out

}

# Stack the per-year-month tables into a single data.table.
# rbindlist() with fill = TRUE matches columns by name and pads columns
# missing from a table with NA. It replaces rbind.fill(), which belongs to
# plyr -- a package that is not among the libraries loaded by this script --
# and it returns a data.table directly.
debate_title_scores_liwc <- rbindlist(year_out_list,
                                      use.names = TRUE,
                                      fill = TRUE)

save(debate_title_scores_liwc, file = "working/validation/debate_title_scores_liwc.Rdata")
